**********************************
* Created by: David Zhang/Jason Premo
* Date: 9/7/2016
*
* Reviewed by:
* Date:
*
* Description: Creates the SCPC and DCPC numbers for the Townsend paper.
* The asset, liability, and income sections of this code are identical
* to those in the old version, which was originally created by David Zhang
* and then heavily modified by Jason Premo. However, the DCPC section in
* this file uses the new, cleaned dataset.
*
* Filepath: /shared/sdsjxs/Townsend/scripts/scpc_dcpc.do
*
********************************

clear all
set more off
cap log close
set maxvar 30000

*AUDIT COMMENT: global value for suffix to audit replication output files
global suffix ""
local scpc_path "/shared/sdsjxs/SCPC/Current_datasets/"
local dcpc_path "/shared/sdsjxs/DCPC/Current_datasets/2012"
local out_path "/shared/sdsjxs/Townsend/results"
local aux_path "/shared/sdsjxs/Townsend/rawdata/auxiliary"
local script_path "/shared/sdsjxs/Townsend/scripts"

* 18+ population share and number of households (numerator and denominator)
* are both from Haver analytics.
local hhconversion = (235036000/114960000)

******************************** SCPC 2012 *************************************

// BALANCE SHEET
use "`scpc_path'/scpc_2012.dta", clear


**** Assets ****

* Use a cutoff of 5 million to clean these values (some respondents may not have seen that we ask for numbers in thousands) and this was fixed in 2015
replace non_house_assets = non_house_assets*1000 if non_house_assets<=3000
replace house_market_value = house_market_value*1000 if house_market_value<=3000
*replace non_house_assets = 0 if missing(non_house_assets)
*replace house_market_value = 0 if missing(house_market_value)
*Get rid of non-homeowners
replace house_market_value = 0 if de013 == 2

*Clean the assets to deal with households that have been sampled multiple times, and therefore
*double (or triple, quadrouple...) reported their household assets/debts.
gen nm_house1 = !missing(house_market_value)
gen nm_nonhouse1 = !missing(non_house_assets)

split prim_key, parse(":")
destring prim_key1, replace
destring prim_key2, replace
gen counter = 1
bysort prim_key1: egen hh_members = total(counter)

gen house_temp = house_market_value
replace house_temp = . if house_market_value == 0
bysort prim_key1: egen avg_house = mean(house_temp)
bysort prim_key1: replace avg_house = avg_house[_n+1] if missing(avg_house) & !missing(avg_house[_n+1])
bysort prim_key1: replace avg_house = avg_house[_n-1] if missing(avg_house) & !missing(avg_house[_n-1])

gen nonhouse_temp = non_house_assets
replace nonhouse_temp=. if non_house_assets == 0
bysort prim_key1: egen avg_nonhouse = mean(nonhouse_temp)
bysort prim_key1: replace avg_nonhouse = avg_nonhouse[_n+1] if missing(avg_nonhouse) & !missing(avg_nonhouse[_n+1])
bysort prim_key1: replace avg_nonhouse = avg_nonhouse[_n-1] if missing(avg_nonhouse) & !missing(avg_nonhouse[_n-1])

bysort prim_key1: egen nm_house = max(nm_house1)
bysort prim_key1: egen nm_nonhouse = max(nm_nonhouse1)

replace avg_house = 0 if missing(avg_house) & nm_house == 1
replace avg_nonhouse = 0 if missing(avg_nonhouse) & nm_nonhouse == 1

* Not sure about weights here. Need to ask Marcin
*AUDIT COMMENT: Did you verify this? The r_weight is what we typically use so this seems correct to me.
*FINALIZATION: Yes, the question is whether or not taking an average of individual weights is appropriate for producing a household level
* weight. The answer is "no", but I haven't yet figured out a better method.
collapse (sum) csh_total_hh=csh_total (mean) non_house_assets_hh=avg_nonhouse house_market_value_hh=avg_house r_weight hh_members, by(prim_key1)

*Note that hh_members below is the number of household members we have in the survey,
*not the number of members actually in the household.
replace csh_total_hh = csh_total*`hhconversion'/hh_members

egen tot_assets_hh = rowtotal(non_house_assets_hh house_market_value_hh csh_total_hh)

collapse (mean) csh_total_hh non_house_assets_hh house_market_value_hh (p50) med_assets = tot_assets_hh [pw=r_weight]
xpose, varname clear
rename _varname varname
order varname
replace v1 = round(v1)
export excel using "`out_path'/balance_sheet$suffix.xlsx", sheet(dcpc_assets) sheetreplace



**** Liabilities ****

use "`scpc_path'/scpc_2012.dta", clear

*Mortgage debt (this used to be 5000... I lowered it. Seems more reasonable)
replace de015 = de015*1000 if de015<=3000
*Non-mortgage debt
replace de019 = de019*1000 if de019<=3000

*Get rid of non-homeowners
replace de015 = 0 if de013 == 2

*pu009 == 2 means non-revolvers
replace cc_debt_revolver_amnt = 0 if pu009==2

*Clean the debts a bit to deal with households that have been sampled multiple times
* and therefore double (or triple, quadruple...) reported their household assets/debts.
*Find households that have been sampled multipled times
gen nm_mortgage1 = !missing(de015)
gen nm_nonmort1 = !missing(de019)

split prim_key, parse(":")
destring prim_key1, replace
destring prim_key2, replace
gen counter = 1
bysort prim_key1: egen hh_members = total(counter)

*Adjust all other debt by cc debt. Note that this creates some negative values. For now, I'll simply take these
*values as reported and hope the errors come out in the wash.
replace de019 =  de019-pu010 if !mi(pu010)

*Take an average, fill in missing/zero values within-household with the mean of remaining reported values.
gen mort_temp = de015
replace mort_temp = . if de015 == 0
bysort prim_key1: egen avg_mort = mean(mort_temp)
bysort prim_key1: replace avg_mort = avg_mort[_n+1] if missing(avg_mort) & !missing(avg_mort[_n+1])
bysort prim_key1: replace avg_mort = avg_mort[_n-1] if missing(avg_mort) & !missing(avg_mort[_n-1])

gen nonmort_temp = de019
replace nonmort_temp = . if de019 == 0
bysort prim_key1: egen avg_nonmort = mean(nonmort_temp)
bysort prim_key1: replace avg_nonmort = avg_nonmort[_n+1] if missing(avg_nonmort) & !missing(avg_nonmort[_n+1])
bysort prim_key1: replace avg_nonmort = avg_nonmort[_n-1] if missing(avg_nonmort) & !missing(avg_nonmort[_n-1])

bysort prim_key1: egen nm_mortgage = max(nm_mortgage1)
bysort prim_key1: egen nm_nonmort = max(nm_nonmort1)

*AUDIT COMMENT: I'm not sure if you have a reason for doing it this way, but you may not want to impute missings with 0s. When taking averages, it is generally our practice
* to exclude people who didn't answer rather than assume that a non-answer equates to 0. If you don't do this (comment out lines 131,132, 138, 139 and 140)
*it changes the value of mortagage_hh in the dcpc_liabilities sheet of the output from 64321.11 to 139093 and the otherdebts_hh from 45404.34 to 67349.53
* FINALIZATION: The replacement with zero is necessary, but you're right that it wasn't quite done correctly.
* I've fixed it now.

replace avg_mort = 0 if missing(avg_mort) & nm_mortgage == 1
replace avg_nonmort = 0 if missing(avg_nonmort) & nm_nonmort == 1

*I'm not sure if this is a technically correct way to handle the weights... need to ask Marcin
*AUDIT COMMENT: Again, should make sure these are the right weights
collapse (sum) cc_debt_revolver_amnt_hh = cc_debt_revolver_amnt (mean) mortgage_hh=avg_mort otherdebts_hh=avg_nonmort r_weight hh_members, by(prim_key1)

replace cc_debt_revolver_amnt_hh = cc_debt_revolver_amnt_hh*`hhconversion'/hh_members

egen tot_debt_hh = rowtotal(cc_debt_revolver_amnt_hh mortgage_hh otherdebts_hh)

collapse (mean) tot_debt_hh cc_debt_revolver_amnt_hh mortgage_hh otherdebts_hh (p50) med_liabilities=tot_debt_hh [pw=r_weight]
xpose, varname clear
rename _varname varname
order varname
replace v1 = round(v1)
export excel using "`out_path'/balance_sheet$suffix.xlsx", sheet(dcpc_liabilities) sheetreplace

***** Income *****

******* Income imputation ********
* In order to deal with the problem of our censored income data, I use the SCF
* to impute incomes for our respondents. The setup for this is done here. 

use /shared/sdsjxs/Townsend/rawdata/scf/p13i6.dta, clear
drop J*
qui describe, varlist
gen aw = X42001

*See scf_clean for an explanation of this.
foreach item in `r(varlist)' {
	qui replace `item' = 0 if `item' == -1
	qui replace `item' = . if `item' < -1 
	}

*Model candidates: mfp, mvrs

*Available demographics in both the SCF and SCPC:
* Sex (X8021 = 1 is male, 2 is female)
* DOB (X5907 = month, X7003 = day, X5908 = year)
* Marital status (X7019 < 44 in SCF)
* Race (X6809 X6810 X6811 X6812 X6813 X6814)
* Education
* Income brackets (of course)
*bysort XX1: gen unique = 1 if _n == 1
*drop if unique != 1


gen male = X8021 == 1
replace male = . if X8021 == 0
gen age = 2012 - X5908
gen age2 = age^2
gen married = X7019 < 44
replace married = . if X7019 > 77
gen white = X6809 == 1
gen black = X6809 == 2
gen hispanic = X6809 == 3 | X7004 == 1
gen asian_other = X6809 == .
gen high_school = X5902 == 1
gen vocational_school = X5903 == 1 | X5905 == 10
gen associates_degree = X5905 >= 1
gen bachelors_degree = X5905 >= 2
gen postgrad_degree = X5905 >= 4 & X5905 != 10
egen income_bracket = cut(X5729), at(0,4999, 7499, 9999, 12499, 14999, 19999, 24999, 29999, 34999, 39999, 49999, 59999, 74999, 99999, 124999, 199999,180000000) icodes
replace income_bracket = income_bracket+1
gen income = X5729
gen log_income = log(income)
*Note: This caps out at 12
gen num_in_hh = X7001 

*Because of the nonlinearity in incomes, I do one regression for people below 200k in annual income and another for those above

local reg_vars age age2 male married white black hispanic asian_other high_school vocational_school associates_degree bachelors_degree postgrad_degree num_in_hh
local cat_vars income_bracket

regress log_income `reg_vars' i.(`cat_vars') if income < 200000

tempfile scf
save `scf', replace

use "`scpc_path'/scpc_2012.dta", clear

* Combine familyincome variables
replace familyincome = 14  if familyincome_part2==1
replace familyincome = 15  if familyincome_part2==2
replace familyincome = 16  if familyincome_part2==3
replace familyincome = 17  if familyincome_part2==4

* Create/rename other demographic variables.
gen num_in_hh = householdmembers
gen income_bracket = familyincome
gen age2 = age^2
gen asian_other = asian == 1 | other == 1
gen high_school = inlist(highesteducation, 9, 10)
gen associates_degree = highesteducation == 12
gen vocational_school = highesteducation == 11
gen bachelors_degree = highesteducation == 13
gen postgrad_degree = highesteducation >= 14

predict log_income_l if income_bracket != 17
gen hhincome = exp(log_income_l)

tempfile scpc
save `scpc', replace

* Do the regression for incomes over 200k
use `scf', clear
regress log_income `reg_vars' i.(`cat_vars') if income >= 200000
use `scpc', clear

* Create imputed incomes
predict log_income_h if income_bracket == 17
replace hhincome = exp(log_income_h) if income_bracket == 17

gen hhincome_cen = hhincome if income_bracket != 17

* Incomes are now created. Need to do an adjustment for households which were sampled multiple times.
split prim_key, parse(":")
destring prim_key1, replace
destring prim_key2, replace

collapse (mean) hhincome hhincome_cen r_weight, by(prim_key1)

collapse (mean) hhincome hhincome_cen (p50) med_income=hhincome [pw=r_weight]
xpose, varname clear
rename _varname varname
order varname
replace v1 = round(v1)
export excel using "`out_path'/balance_sheet$suffix.xlsx", sheet(dcpc_income) sheetreplace



*** DCPC expenditures
import excel using "`aux_path'/DCPC Merchant Types vs Mi Codes.xlsx", firstrow clear
rename DCPCMerchantTypeCode merch

merge 1:m merch using "`dcpc_path'/dcpc_2012_v2.dta"

drop if month(date) != 10

rename amnt amount

bysort prim_key diary_day: gen first = 1 if _n == 1
bysort prim_key: egen days_active = total(first)

replace type = 2 if merch == 26
drop if amount == 0

forvalues code = 1(1)13 {
	ge m`code'_amnt = amount if MiCode == `code' & !missing(pi)
	gen m`code'_cur_amnt = amount if MiCode == `code' & inlist(type, 1, 2, 4) & pi == 1
	gen m`code'_cc_amnt = amount if MiCode == `code' & inlist(type, 1, 2, 4) & pi == 3
	gen m`code'_dda_amnt = amount if MiCode == `code' & inlist(type, 1, 2, 4) & inlist(pi, 2, 4, 5, 7, 8, 9, 10)
	gen m`code'_pp_amnt = amount if MiCode == `code' & inlist(type, 1, 2, 4) & inlist(pi, 6, 11, 12)
	}

gen total = amount if inlist(type, 1, 2, 4) & !missing(pi)
gen total_notax = amount if inlist(type, 1, 2, 4) & !missing(pi) & missing(m11_amnt)

collapse (sum) total* *amnt (mean) ind_weight days_active, by(prim_key)

su ind_weight, d

*Rescaling the weights so they sum to 1
gen weight_new = ind_weight/(`r(sum)'*days_active)

collapse (sum) total* *amnt [pw=weight_new]


xpose, varname clear
rename _varname micode
rename v1 amount
order micode

*Total average yearly spending for a respondent
replace amount = round(amount*365*`hhconversion')

recast long amount
export excel using "`out_path'/balance_sheet.xlsx", sheet(dcpc_expenditure) sheetreplace






*******DCPC Cash flow numbers
*import excel using "`aux_path'/DCPC Merchant Types vs Mi Codes.xlsx", firstrow clear
*rename DCPCMerchantTypeCode merch

*merge 1:m merch using "`dcpc_path'/dcpc_2012_v2.dta"
use "`dcpc_path'/dcpc_2012_v2.dta", clear
*rename MiCode micode

drop if month(date) != 10

bysort prim_key diary_day: gen first = _n == 1
bysort prim_key: egen days_active = total(first)


*** Expenditures
gen double all_cons = amnt if type == 1 & merch != 26
replace all_cons = .09*amnt if type == 1 & merch == 35
gen double cur_cons = all_cons if pi == 1
replace cur_cons = all_cons/2 if pi == 9
gen double dda_cons = all_cons if inlist(pi, 2, 4, 5, 7, 8, 12)
replace dda_cons = all_cons/2 if pi == 9
gen double cc_cons = all_cons if inlist(pi, 3, 11)
gen double nfda_cons = all_cons if pi == 6
* No LTA or forex consumption recorded


*****To/From Currency
*I've checked the cash withdrawal "other" responses, and a large proportion of
* them seem to be invalid insofar as this is concerned. However,
* I should go back to the cleaning script and do some recategorization at some point.
gen double dda_to_cur_amnt = amnt if type == 4 & inlist(cw_source, 1, 2)
gen double cc_to_cur_amnt = amnt if type == 4 & cw_source == 5
gen double nfda_to_cur_amnt = amnt if type == 4 & cw_source == 6
gen double forex_to_cur_amnt = amnt if type == 13

gen double cur_inc = amnt if type == 4 & inlist(cw_source, 3, 4, 8) 
*gen lta_to_cur_amnt = amnt if type == 4 & cw_source == 2

gen double cur_to_dda_amnt = amnt if type == 5
*no currency to credit measured
gen double cur_to_nfda_amnt = amnt if type == 2 & pi == 1
gen double cur_to_forex_amnt = amnt if type == 12
*I guess mortgage payments are currency to LTA?
gen double cur_to_lta_amnt = amnt if merch == 26 & pi == 1

*****To/from DDA
*No dda to cc (that we can identify)
gen double dda_to_nfda_amnt = amnt if type == 2 & inlist(pi, 2, 4, 5, 7, 8, 11)
*No dda to forex
gen double dda_to_lta_amnt = amnt if merch == 26 & inlist(pi, 2, 4, 5, 7, 8, 11)
*No nfda to dda
*No LTA to DDA that we can identify here


****To/From credit
gen double cc_to_nfda_amnt = amnt if type == 2 & pi == 3
*no credit to forex
gen double cc_to_lta_amnt = amnt if inlist(pi, 3, 11) & merch == 26
*no nfda to cc
*no lta to cc


****To/from NFDA
*no nfda to forex
gen double nfda_to_lta_amnt = amnt if pi == 6 & merch == 26

****Forex/LTA are both covered as much as they can be



***** New idea: changes in each account balance, as implied by transactions




*Mortgages payments should be bills (Need to do this in the original cleaning script) /*
replace bill = 1 if merch == 26
rename amnt amount
drop if amount == 0

gen all_exp = amount if inlist(type, 1, 4) & !missing(pi) & micode != 11

gen cash_exp = amount if pi == 1 & inlist(type, 1, 4) & micode != 11
gen prepaid_exp = amount if pi == 6 & inlist(type, 1, 2, 4) & micode != 11
gen noncash_money_exp = amount if inlist(type, 1, 2, 4) & inlist(pi, 2, 4, 5, 7, 8, 9, 10, 11, 12)  & micode != 11

gen cash_deposit = amount if type == 6  & micode != 11
gen cash_withdrawal = amount if type == 5  & micode != 11

gen credit_exp = amount if pi == 3  & micode != 11  & micode != 11
gen credit_advance = amount if type == 5 & cw_source == 5   & micode != 11

gen currency_bill = amount if type == 2 & merch == 35 & pi == 1  & micode != 11
gen dda_bill = amount if type == 2 & merch == 35 & inlist(pi, 2, 4, 5, 7, 8, 9) & micode != 11
gen other_bill = amount if type == 2 & merch == 35 & inlist(pi, 6, 11, 12)  & micode != 11

*gen dda_withdrawal = amount if inlist(pi, 2, 4, 5, 7, 8, 9) & inlist(type, 1, 2)  & micode != 11

gen cash_ppl = amount if type == 3 & pi == 1 & micode != 11
*/


**** Update: 3-15-17: We're now using start and end of month balances for
** the DCPC to use in the statement of cash flows
*** Update again: Actually... no. Doesn't make any sense to do this.


collapse (sum) all_cons cur_* dda_* cc_* nfda_* forex_* (mean) ind_weight days_active, by(prim_key)

su ind_weight, d
gen weight_new = ind_weight/(`r(sum)'*days_active)

collapse (sum) all_cons  cur_* dda_* cc_* nfda_* forex_* [pw=weight_new]

xpose, varname clear
rename _varname varname
order varname
rename v1 amount
replace amount = round(amount*31*`hhconversion')

export excel using "`out_path'/balance_sheet$suffix.xlsx", sheet(dcpc_cashflow) sheetreplace




